Split a column into multiple columns

Create DataFrame
// Build the sample employee DataFrame from in-memory tuples.
// Each tuple is one employee; the tuple fields are renamed positionally by toDF below.
val empDF = {
  // Note: the "JONES SMITH " value carries a trailing space in the sample data.
  val employeeRows = Seq(
    (7369, "RAJ SEKHAR", "CLERK", 7902, "17-Dec-80", 800, 20, 10),
    (7499, "SMITH ALLEN", "SALESMAN", 7698, "20-Feb-81", 1600, 300, 30),
    (7521, "JAMES WARD", "SALESMAN", 7698, "22-Feb-81", 1250, 500, 30),
    (7566, "JONES SMITH ", "MANAGER", 7839, "2-Apr-81", 2975, 0, 20),
    (7654, "MARTIN USHA", "SALESMAN", 7698, "28-Sep-81", 1250, 1400, 30),
    (7698, "JEMS BLAKE", "MANAGER", 7839, "1-May-81", 2850, 0, 30),
    (7782, "MARK CLARK", "MANAGER", 7839, "9-Jun-81", 2450, 0, 10),
    (7788, "TOM SCOTT", "ANALYST", 7566, "19-Apr-87", 3000, 0, 20),
    (7839, "RAJ KING", "PRESIDENT", 0, "17-Nov-81", 5000, 0, 10),
    (7844, "BOLT TURNER", "SALESMAN", 7698, "8-Sep-81", 1500, 0, 30),
    (7876, "YEND ADAMS", "CLERK", 7788, "23-May-87", 1100, 0, 20)
  )
  // Column names, in the same order as the tuple fields above.
  val columnNames = Seq("empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno")

  spark.createDataFrame(employeeRows).toDF(columnNames: _*)
}

DataFrame Schema before Split
// Print the schema of empDF before any column is split.
// printSchema is side-effecting, so it is called with explicit parentheses.
empDF.printSchema()

DataFrame schema after splitting the ename column into first name and last name
import org.apache.spark.sql.functions._
// Split "ename" on a single space and expose the first two tokens as
// "FirstName" and "LastName". Only these two columns are selected, so the
// result contains nothing else and no trailing drop(...) is required.
val empDF1 = {
  // Build the split expression once and reuse it for both output columns.
  val nameParts = split(col("ename"), " ")
  empDF.select(
    nameParts.getItem(0).as("FirstName"),
    nameParts.getItem(1).as("LastName")
  )
}
empDF1.printSchema()
import org.apache.spark.sql.functions._
// Keep every original column except "ename", which is replaced in place by
// "FirstName" and "LastName" (the first two space-separated tokens of "ename").
// "ename" is excluded simply by not selecting it, so no drop(...) is required.
val empDF1 = {
  // Build the split expression once and reuse it for both output columns.
  val nameParts = split(col("ename"), " ")
  empDF.select(
    col("empno"),
    nameParts.getItem(0).as("FirstName"),
    nameParts.getItem(1).as("LastName"),
    col("job"), col("mgr"), col("hiredate"),
    col("sal"), col("comm"), col("deptno")
  )
}
empDF1.printSchema()
import org.apache.spark.sql.functions._
// Split "ename" on a space into FirstName/LastName and "hiredate" on "-" into
// Day/Month/Year. The DataFrame is bound to empDF1 first and displayed
// afterwards; chaining .show() into the initializer would bind empDF1 to Unit.
val empDF1 = {
  // Build each split expression once and reuse it for its output columns.
  val nameParts = split(col("ename"), " ")
  val dateParts = split(col("hiredate"), "-")
  empDF.select(
    nameParts.getItem(0).as("FirstName"),
    nameParts.getItem(1).as("LastName"),
    dateParts.getItem(0).as("Day"),
    dateParts.getItem(1).as("Month"),
    dateParts.getItem(2).as("Year")
  )
}
empDF1.show()

No comments:

Post a Comment